Goal

from a selection of possible subset/chosen ESMs, come up with a metric for each of those subsets that says how good the subset is at representing the ‘truth’ that is the PCA of all of the CMIP6 data.

Then select the subset of ESMs with the smallest value for that metric and those are our subset ESMs.

Work with this metric:

The coefficients of all CMIP6 data projected into the first N eigenvectors from the PCA of all data.

For each of the OOS esmXscenarios, for each subset ESM, calculate the smallest L2 distance from the coefficients across the subset ESM’s scenarios= how close each subset ESM can get to every out of sample data point.

Then take the minimum across subset ESMs = how close the selected subset can get to each OOS data point.

The average of that distance from all OOS data points is then the metric for that subset.

set up

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

metrics_dir <-  'extracted_timeseries/extracted_metrics/'
write_dir <- 'subset_summary_metrics/'
fig_dir <- 'figures/'

# ESMS that dont have 4 experiments
non_candidate_esm <- c("CIESM", 
                       "E3SM-1-1", "FIO-ESM-2-0", "GFDL-CM4", "IITM-ESM",
                       "IPSL-CM5A2-INCA", "KIOST-ESM", "NESM3", "NorESM2-LM")
            

# Figure sizing info
title_size <- 14
axis_size <- 12

Load ESM data that’s been processed

region_summary_main <- read.csv(paste0(metrics_dir, 'IPCC_land_regions_metrics.csv'), 
                                stringsAsFactors = FALSE)
region_summary_main %>%
  filter(experiment != 'ssp119',
         experiment != 'ssp434',
         experiment != 'ssp460',
         experiment != 'ssp534-over',
         !(esm %in% non_candidate_esm))  %>% # remove from the full set of data 
                                             # that creates our 'true' CMIP6 PCA space
    rename(region = acronym) ->
  region_summary 

print(head(region_summary))
##          esm experiment ensemble variable       type              name region
## 1 ACCESS-CM2     ssp126 r1i1p1f1       pr       Land Arabian-Peninsula    ARP
## 2 ACCESS-CM2     ssp126 r1i1p1f1       pr       Land    Central-Africa    CAF
## 3 ACCESS-CM2     ssp126 r1i1p1f1       pr Land-Ocean         Caribbean    CAR
## 4 ACCESS-CM2     ssp126 r1i1p1f1       pr       Land       C.Australia    CAU
## 5 ACCESS-CM2     ssp126 r1i1p1f1       pr       Land   C.North-America    CNA
## 6 ACCESS-CM2     ssp126 r1i1p1f1       pr       Land            E.Asia    EAS
##           iasd      end_val   end_anomaly end_anomaly_pct mid_century_val
## 1 7.227917e-07 1.700625e-06  4.306991e-07     0.339153029    2.051770e-06
## 2 3.009324e-06 4.828026e-05 -4.965491e-08    -0.001027416    4.941042e-05
## 3 6.106140e-06 3.052769e-05  2.089845e-07     0.006892922    3.429079e-05
## 4 2.830779e-06 1.130480e-05 -1.269868e-06    -0.100986213    1.126320e-05
## 5 2.861294e-06 3.232150e-05  1.953880e-06     0.064340892    3.222543e-05
## 6 2.687448e-06 5.449186e-05  5.826705e-06     0.119730531    5.260652e-05
##     mid_anomaly mid_anomaly_pct
## 1  7.818446e-07      0.61566180
## 2  1.080502e-06      0.02235680
## 3  3.972086e-06      0.13101110
## 4 -1.311461e-06     -0.10429388
## 5  1.857811e-06      0.06117738
## 6  3.941364e-06      0.08098944

Prep data for PCA

# reshape:
grouped_data <- split(region_summary, list(region_summary$esm, region_summary$experiment))
grouped_data <- grouped_data[sapply(grouped_data, function(x) dim(x)[1]) > 0]

shaped_data <- lapply(grouped_data, function(group){
    if (nrow(group) >0) {
      group %>%
        filter(variable == 'tas') %>% 
        select(esm, experiment, ensemble,type, region,
               iasd, end_anomaly, mid_anomaly) %>%
        rename(tas_iasd = iasd,
               tas_end_anomaly = end_anomaly,     
               tas_mid_anomaly = mid_anomaly) %>% 
        left_join(group %>%
                      filter(variable == 'pr') %>% 
                      select(esm, experiment, ensemble,type, region,
                             iasd, end_anomaly_pct, mid_anomaly_pct) %>%
                      rename(pr_iasd = iasd,
                             pr_end_anomaly_pct = end_anomaly_pct,    
                             pr_mid_anomaly_pct = mid_anomaly_pct),
                  by = c('esm','experiment', 'ensemble', 'type', 'region')
               ) ->
    reshaped
    
    reshaped %>%
        group_by(esm, experiment, type, region) %>%
        summarize(tas_iasd=mean(tas_iasd),
                  tas_end_anomaly=mean(tas_end_anomaly, na.rm = T),
                  tas_mid_anomaly = mean(tas_mid_anomaly, na.rm = T),
                  pr_iasd = mean(pr_iasd, na.rm = T),
                  pr_end_anomaly_pct = mean(pr_end_anomaly_pct, na.rm = T),
                  pr_mid_anomaly_pct = mean(pr_mid_anomaly_pct, na.rm = T) ) %>%
        ungroup() %>%
        mutate(ensemble = 'ensemble_avg') -> #%>%
        #bind_rows(reshaped) ->
        reshaped2
    
    #TODO need to change shaping if including ensemble members
    reshaped2 %>%
        select(-type) %>%
        gather(metric, value, -esm, -experiment, -ensemble, -region) %>%
        mutate(row_id = paste0(region, '~', metric),
           col_id = paste0(esm, '~', experiment, '~', ensemble)) %>%
        select(-esm, -experiment,-ensemble, -region, -metric) %>%
        as.data.frame() ->
    reshaped3 
  
  colnames(reshaped3) <- c(paste0(unique(reshaped3$col_id)), 'row_id', 'col_id')
  rownames(reshaped3) <- paste0(reshaped3$row_id)
  
  reshaped3 %>% 
    select(-col_id, -row_id) ->
    out
  
  return(out)  
    }
    
    }
)

# combine columns but then transpose because prcomp wants rows to be observations 
# and columns to be variables (but doing it the other way first is easier to code)
full_data <- t(do.call(cbind, shaped_data) )


# Drop anything with missing data
full_data1 <- na.omit(full_data)
print('removed rows due to missing data')
## [1] "removed rows due to missing data"
print(row.names(full_data)[c(which(!(row.names(full_data) %in% row.names(full_data1))))])
## character(0)
full_data <- as.data.frame(full_data1)
rm(full_data1)

summary figure of raw data

full_data %>%
    mutate(id = row.names(.)) %>%
    separate(id, into = c('model', 'scenario', 'trash'), sep= '~') %>%
    select(-trash) %>%
    gather(id2, value, -model, -scenario) %>%
    separate(id2, into = c('region', 'metric'), sep = '~') ->
    raw_plt


p_raw <- ggplot(data = raw_plt) +
    geom_point(mapping = aes(x = region, y = value,
                             color = model, shape = scenario)) +
    ggtitle('Projection of ESM data into the first five eigenvectors') +
    facet_wrap(~metric, scale = 'free_y') +
    theme_bw()+
    theme(strip.text = element_text(size = axis_size),
          plot.title = element_text(size=title_size),
          legend.text = element_text(size = axis_size-4),
          legend.key.width = unit(0.3, 'cm'),
          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    xlab('') + ylab('')+ guides(color="none", shape = 'none')

ggsave(paste0(fig_dir, 'full_data_raw_plot.png'),p_raw, width = 10, height =6, units = 'in')
ggsave(paste0(fig_dir, 'full_data_raw_plot.jpg'), p_raw, width =  10, height = 6, units = 'in')

p_raw

PCA

cmip6 ‘truth’

full_pca <- prcomp(full_data, center=TRUE, scale = TRUE)


# eigenspace from the PCA
full_eigenval <- data.frame(eigenvalues=(full_pca$sdev)^2)
full_eigenvec <- full_pca$rotation

write.csv(full_eigenval, 'orig_eval.csv')
write.csv(full_eigenvec, 'orig_evec.csv')

# skree
var_explained_df <- data.frame(var_explained=(full_pca$sdev)^2/sum((full_pca$sdev)^2)) %>%
    mutate(PC = as.integer(row.names(.)))

var_explained_df %>%
  ggplot(aes(x=PC,y=var_explained, group=1))+
  geom_point(size=4)+
  geom_line()+
  labs(title="Scree plot: PCA on scaled data - all ESMs")

pfull <- var_explained_df %>%
    filter(PC <=15)%>%
  ggplot(aes(x=PC,y=var_explained, group=1))+
  geom_point(size=4)+
  geom_line()+
  labs(title="Scree plot: PCA on scaled data - all ESMs") +
    xlab('eigenvector') + ylab('fraction of total variance explained') +
    theme(axis.title = element_text(size = axis_size), 
          plot.title = element_text(size = title_size))

ggsave(paste0(fig_dir, 'scree_fulldata.png'), pfull, width = 8, height = 6, units = 'in')
ggsave(paste0(fig_dir, 'scree_fulldata.jpg'), pfull, width = 8, height = 6, units = 'in')

pfull 

-> first five eigenvalues focus

Spatial plots of eigenvectors

as.data.frame(full_eigenvec[,1:5]) %>%
    mutate(id = row.names(.)) %>%
    separate(id, into = c('region', 'index'), sep = '~')  %>%
    gather(PC, value, -region, -index) ->
    evec
## Linking to GEOS 3.12.0, GDAL 3.7.2, PROJ 9.3.0; sf_use_s2() is TRUE
## Reading layer `IPCC-WGI-reference-regions-v4' from data source 
##   `/Users/snyd535/Documents/GitHub/SnyderEtAl2023_uncertainty_informed_curation_metarepo/IPCC-WGI-reference-regions-v4_shapefile/IPCC-WGI-reference-regions-v4.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 58 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -90 xmax: 180 ymax: 90
## Geodetic CRS:  WGS 84

compare coefficients in full PCA space

from the set of available coefficients in the full PCA space, select models whose coefficients span the space in the sense of:

  • every ESM is ‘close enough’ to one of the selected models in terms of L2 distance of coefficients

  • Making sure each oosESM-Exp combo being similar to one chosenESM-Exp combo

functionalized

get_subset_data <- function(esmsubset, fulldata){
    as.data.frame(fulldata) %>%
        mutate(temp = row.names(.)) %>%
        separate(temp, into=c('esm', 'trash'), sep = '~', extra='merge') %>%
        filter(esm %in% esmsubset) %>% 
        select(-esm, -trash) %>%
        as.matrix(.) ->
        subdata
    
    return(subdata)
}


full_pca_distances <- function(subset_esms, fullpca, n_eigenvalues = 5){

    coeff <- as.data.frame(fullpca$x[,1:n_eigenvalues]) 
    
    # reshape so have info by ESM and experiment
    coeff %>%
        mutate(id = row.names(.)) %>%
        separate(id, into = c('esm', 'scenario', 'ensemble'), sep ='~') %>%
        gather(pc, value, -esm, -scenario, -ensemble) -> # %>%
        # spread(scenario, value) ->
        coeff_all_ESM
    
    # pull off the subset coefficients into their own DF for comparison
    coeff_all_ESM %>%
        filter(esm %in% subset_esms)  %>%
        spread(scenario, value) ->
        coeff_sub_ESM
    
    out <- data.frame()
    for (esmname in unique(coeff_all_ESM$esm)){
        
        # only calculate for oos ESMs
        if(!(esmname %in% subset_esms)){
            
            coeff_all_ESM %>%
                filter(esm == esmname) ->
                oos 
            
            # For each subset ESM, compare each oos scenario with all 4 scenarios
            # and record the smallest. Then have a data frame with how close each
            # subset ESM can get to each scenario of the oos ESM.
            oos %>%
                left_join(coeff_sub_ESM %>%    
                              select(-ensemble) %>%
                              rename(subset_ESM = esm), 
                          by = 'pc') %>%
                group_by(esm, scenario, ensemble, subset_ESM) %>%
                summarise(closest_L2 = min( sqrt(sum((value-ssp126)^2)),
                                            sqrt(sum((value-ssp245)^2)),
                                            sqrt(sum((value-ssp370)^2)),
                                            sqrt(sum((value-ssp585)^2))))  %>%
                ungroup %>% 
                # Then filter to just the closest of any subset ESM
                # on each oos scenario
                group_by(esm, scenario, ensemble) %>%
                filter(closest_L2 == min(closest_L2)) %>%
                ungroup %>%
                bind_rows(out, .) ->
                out
            }        # end calculation for each subset esm
        
        } # end loop over all ESMs
    
    # take average L2 for all oos esms - that's the characteristic value for this
    # particular subset. Then pick the subset that has smallest characteristic value.
    
    return(data.frame(avg_closest_L2 = mean(out$closest_L2)))
    
}


########

all subsets

nESMs <- 5

# Get the set of all possible combinations of 5 ESMs
region_summary %>%
    select(esm) %>%
    distinct %>% 
    filter(!(esm %in% non_candidate_esm)) ->
    df

as.data.frame(t(apply(combn(nrow(df), nESMs), 
                      2, function(i) df[i,])
                )
              ) %>%
    mutate(subset_id = paste0('subset', as.integer(row.names(.)))) %>%
    gather(trash, subset, -subset_id) %>%
    select(-trash) %>%
    arrange(subset_id) ->
    table_of_subsets
write.csv(table_of_subsets, paste0(write_dir,'subsets_22choose5.csv'), row.names = FALSE)

# Same data but reshaped
table_of_subsets <- read.csv(paste0(write_dir,'subsets_22choose5.csv')) 
table_of_subsets %>%
    mutate(mod = as.integer(row.names(.)) %% 5,
           esm=paste0('esm', mod)) %>% 
    select(-mod) %>% 
    spread(esm, subset) %>% 
    mutate(id = paste(esm1, esm2, esm3, esm4, esm0, sep='~')) ->
    table_of_subsets2

# Table of ECS preserving subsets
ecs_subsets <- read.csv("additional_data/five_sample_esm_climate_sensitivity.csv",
                        stringsAsFactors = F)
ecs_subsets %>%
    mutate(id = paste(esm1, esm2, esm3, esm4, esm5, sep='~')) %>%
    gather(esm_id, esm, -cs1, -cs2, -cs3, -cs4, -cs5, -id ) %>% 
    gather(cs_id, cs, -esm_id, -esm, -id) %>% 
    mutate(esm_id2 = substr(esm_id, nchar(esm_id), nchar(esm_id)),
           cs_id2 = substr(cs_id, nchar(cs_id), nchar(cs_id))) %>% 
    filter(esm_id2 == cs_id2) %>% 
    select(-esm_id, -cs_id, -esm_id2, -cs_id2) %>%
    arrange(id) %>% 
    left_join(table_of_subsets2 %>% select(subset_id, id), by = 'id') %>% 
    select(subset_id, esm, cs) %>%
    rename(subset=esm) %>%
    na.omit()->
    ecs_subsets_clean
# For every subset, calculate the metrics
metrics <- data.frame()
i <- 1
for (subsetid in unique(table_of_subsets$subset_id)){
    print(subsetid)
    print(i)
    
    subset <- table_of_subsets[table_of_subsets['subset_id']==subsetid,]$subset

    get_subset_data(esmsubset = subset,
                    fulldata = full_data) ->
        subdata
    
    sub_pca <- prcomp(subdata, center=TRUE, scale = TRUE)
    
    data.frame(subset_id = subsetid,
               coeff_metric = suppressWarnings(suppressMessages(full_pca_distances(subset_esms = subset,
                                                 fullpca = full_pca,
                                                 n_eigenvalues = 5)))) %>%
        bind_rows(metrics, .)->
        metrics
               
    rm(subset)
    rm(subdata)
    rm(sub_pca)
    
    i <- i+1
    
}

write.csv(metrics, paste0(write_dir,'esm_subsets_metrics_22choose5.csv'), row.names = FALSE)

Take a look at minimizing sets of ESMs

# Metrics for every subset with IDs
metrics <- read.csv(paste0(write_dir,'esm_subsets_metrics_22choose5.csv')) %>%
    rename(coefficient_metric = avg_closest_L2)

# Table of all subsets with IDs
table_of_subsets <- read.csv(paste0(write_dir,'subsets_22choose5.csv'))
table_of_subsets %>%
    mutate(mod = as.integer(row.names(.)) %% 5,
           esm=paste0('esm', mod)) %>% 
    select(-mod) %>% 
    spread(esm, subset) %>% 
    mutate(id = paste(esm1, esm2, esm3, esm4, esm0, sep='~')) ->
    table_of_subsets2

# Table of ECS preserving subsets
ecs_subsets <- read.csv("additional_data/five_sample_esm_climate_sensitivity.csv", 
                        stringsAsFactors = F)
ecs_subsets %>%
    mutate(id = paste(esm1, esm2, esm3, esm4, esm5, sep='~')) %>%
    gather(esm_id, esm, -cs1, -cs2, -cs3, -cs4, -cs5, -id ) %>% 
    gather(cs_id, cs, -esm_id, -esm, -id) %>% 
    mutate(esm_id2 = substr(esm_id, nchar(esm_id), nchar(esm_id)),
           cs_id2 = substr(cs_id, nchar(cs_id), nchar(cs_id))) %>% 
    filter(esm_id2 == cs_id2) %>% 
    select(-esm_id, -cs_id, -esm_id2, -cs_id2) %>%
    arrange(id) %>% 
    left_join(table_of_subsets2 %>% select(subset_id, id), by = 'id') %>% 
    select(subset_id, esm, cs) %>%
    rename(subset=esm) %>%
    na.omit() ->
    ecs_subsets_clean

minimize across all possible combinations

# Subset that is minimum in the projection metric across all possible subsets
metrics %>%
    filter(coefficient_metric == min(coefficient_metric)) %>%
    left_join(table_of_subsets, by = c('subset_id')) ->
    coefficient_min
hist(metrics$coefficient_metric)

knitr::kable(coefficient_min)
subset_id avg_nrms_across_esms coefficient_metric subset
subset11961 0.6273168 5.404571 BCC-CSM2-MR
subset11961 0.6273168 5.404571 CESM2
subset11961 0.6273168 5.404571 CMCC-ESM2
subset11961 0.6273168 5.404571 MRI-ESM2-0
subset11961 0.6273168 5.404571 UKESM1-0-LL

minimize across ECS-distribution

# Subset that is minimum in the projection metric across all possible subsets
metrics %>%
    left_join(ecs_subsets_clean, by = c('subset_id')) %>%
    na.omit %>%
    filter(coefficient_metric == min(coefficient_metric)) ->
    coefficient_min


hist((metrics %>%
    left_join(ecs_subsets_clean, by = c('subset_id')) %>%
    na.omit)$coefficient_metric)

knitr::kable(coefficient_min)
subset_id avg_nrms_across_esms coefficient_metric subset cs
subset159 0.7677227 6.355302 ACCESS-CM2 4.7
subset159 0.7677227 6.355302 ACCESS-ESM1-5 3.9
subset159 0.7677227 6.355302 BCC-CSM2-MR 3.0
subset159 0.7677227 6.355302 MIROC6 2.6
subset159 0.7677227 6.355302 MRI-ESM2-0 3.2

plotting in full PCA space

full_pca$x %>%
  as.data.frame() %>%
  mutate(id = row.names(.)) %>%
  separate(id, into=c('model', 'scenario', 'ensemble'), sep = '~') %>%
  select(model, scenario, PC1, PC2, PC3, PC4, PC5)->
  coordinates

all PCs

coordinates %>%
    gather(pc_x, pc_x_val, -model, -scenario)  %>%
    left_join(coordinates %>% 
                  gather(pc_y, pc_y_val, -model, -scenario),
               by = c('model', 'scenario'))  %>%
    mutate(x_id = as.integer(substr(pc_x, nchar(pc_x), nchar(pc_x))),
           y_id = as.integer(substr(pc_y, nchar(pc_y), nchar(pc_y)))) %>%
    filter(y_id > x_id) %>%
    # add frac variance explained by each PC to labels:
    left_join(var_explained_df %>%
                  mutate(PC = paste0('PC', PC), 
                         var_explained = round(100*var_explained,
                                               digits = 1)),
              by = c('pc_x' = 'PC')) %>%
    mutate(pc_x = paste0(pc_x, ' (',var_explained, '%)'))   %>%
    select(-var_explained) %>%
    left_join(var_explained_df %>%
                  mutate(PC = paste0('PC', PC), 
                         var_explained = round(100*var_explained,
                                               digits = 1)),
              by = c('pc_y' = 'PC')) %>%
    mutate(pc_y = paste0(pc_y, ' (',var_explained, '%)'))   %>%
    select(-var_explained) ->
    grid_plot
## Warning in left_join(., coordinates %>% gather(pc_y, pc_y_val, -model, -scenario), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
p_coefs <- ggplot(data = grid_plot) +
    geom_point(mapping = aes(x = pc_x_val, y = pc_y_val,
                             color = model, shape = scenario)) +
    #  geom_point(data = grid_plot %>% filter(model %in% coefficient_min$subset),
    #            mapping = aes(x = pc_x_val, y = pc_y_val, color = model, shape = scenario),
    #            size = 2) +
    ggtitle('Projection of ESM data into the first five eigenvectors') +
    facet_grid(pc_y ~ pc_x, switch = 'both') +
    theme(strip.text = element_text(size = axis_size),
          plot.title = element_text(size=title_size),
          legend.text = element_text(size = axis_size-4),
          legend.key.width = unit(0.3, 'cm')) +
    xlab('') + ylab('')

ggsave(paste0(fig_dir, 'full_data_coeff_grid_plot.png'), width = 8, height =6, units = 'in')
ggsave(paste0(fig_dir, 'full_data_coeff_grid_plot.jpg'), width = 8, height =6, units = 'in')

p_coefs

p_coefs <- ggplot(data = grid_plot) +
    geom_point(mapping = aes(x = pc_x_val, y = pc_y_val,
                             color = model, shape = scenario)) +
     geom_point(data = grid_plot %>% filter(model %in% coefficient_min$subset),
               mapping = aes(x = pc_x_val, y = pc_y_val),
               color = 'black', shape = 0, size = 2) +
    ggtitle('Projection of ESM data into the first five eigenvectors') +
    facet_grid(pc_y ~ pc_x, switch = 'both') +
    theme(strip.text = element_text(size = axis_size),
          plot.title = element_text(size=title_size),
          legend.text = element_text(size = axis_size-4),
          legend.key.width = unit(0.3, 'cm')) +
    xlab('') + ylab('')

ggsave(paste0(fig_dir, 'full_data_coeff_grid_plot_subset.png'), width = 8, height =6, units = 'in')
ggsave(paste0(fig_dir, 'full_data_coeff_grid_plot_subset.jpg'), width = 8, height =6, units = 'in')

p_coefs

Cut six obvious dependecies

cut_dependent <- c('ACCESS-CM2', 'CESM2-WACCM', 'CMCC-CM2-SR5', 
                   'FGOALS-f3-L', 'INM-CM4-8', 'MPI-ESM1-2-HR')

repeat coef plot without and drop some dependencies

coordinates %>%
    filter(!(model %in% cut_dependent)) %>%
    gather(pc_x, pc_x_val, -model, -scenario)  %>%
    left_join(coordinates %>% 
                  gather(pc_y, pc_y_val, -model, -scenario),
               by = c('model', 'scenario'))  %>%
    mutate(x_id = as.integer(substr(pc_x, nchar(pc_x), nchar(pc_x))),
           y_id = as.integer(substr(pc_y, nchar(pc_y), nchar(pc_y)))) %>%
    filter(y_id > x_id) %>%
    # add frac variance explained by each PC to labels:
    left_join(var_explained_df %>%
                  mutate(PC = paste0('PC', PC), 
                         var_explained = round(100*var_explained,
                                               digits = 1)),
              by = c('pc_x' = 'PC')) %>%
    mutate(pc_x = paste0(pc_x, ' (',var_explained, '%)'))   %>%
    select(-var_explained) %>%
    left_join(var_explained_df %>%
                  mutate(PC = paste0('PC', PC), 
                         var_explained = round(100*var_explained,
                                               digits = 1)),
              by = c('pc_y' = 'PC')) %>%
    mutate(pc_y = paste0(pc_y, ' (',var_explained, '%)'))   %>%
    select(-var_explained) ->
    grid_plot
## Warning in left_join(., coordinates %>% gather(pc_y, pc_y_val, -model, -scenario), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 2 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
p_coefs2 <- ggplot(data = grid_plot) +
    geom_point(mapping = aes(x = pc_x_val, y = pc_y_val,
                             color = model, shape = scenario)) +
    #  geom_point(data = grid_plot %>% filter(model %in% coefficient_min$subset),
    #            mapping = aes(x = pc_x_val, y = pc_y_val, color = model, shape = scenario),
    #            size = 2) +
    ggtitle('Projection of ESM data into the first five eigenvectors, cut 6 model') +
    facet_grid(pc_y ~ pc_x, switch = 'both') +
    theme(strip.text = element_text(size = axis_size),
          plot.title = element_text(size=title_size),
          legend.text = element_text(size = axis_size-4),
          legend.key.width = unit(0.3, 'cm')) +
    xlab('') + ylab('')

ggsave(paste0(fig_dir, 'full_data_coeff_grid_plot_cut6.png'), width = 8, height =6, units = 'in')
ggsave(paste0(fig_dir, 'full_data_coeff_grid_plot_cut6.jpg'), width = 8, height =6, units = 'in')

p_coefs2

Select minimizing subset from those without one of 6

minimize across ECS-distribution

# Subset that is minimum in the projection metric across all possible subsets
metrics %>%
    left_join(ecs_subsets_clean, by = c('subset_id')) %>%
    na.omit %>%
    group_by(subset_id) %>%
    mutate(drop6 = if_else(subset %in% cut_dependent, 1, 0),
           total = sum(drop6))%>%
    ungroup  %>%
    filter(total == 0) %>%
     filter(coefficient_metric == min(coefficient_metric)) ->
    coefficient_min


hist((metrics %>%
    left_join(ecs_subsets_clean, by = c('subset_id')) %>%
    na.omit)$coefficient_metric)

knitr::kable(coefficient_min)
subset_id avg_nrms_across_esms coefficient_metric subset cs drop6 total
subset6942 0.7634793 6.566167 ACCESS-ESM1-5 3.9 0 0
subset6942 0.7634793 6.566167 BCC-CSM2-MR 3.0 0 0
subset6942 0.7634793 6.566167 MIROC6 2.6 0 0
subset6942 0.7634793 6.566167 MRI-ESM2-0 3.2 0 0
subset6942 0.7634793 6.566167 NorESM2-MM 2.5 0 0
coordinates %>%
    filter(!(model %in% cut_dependent)) %>%
    gather(pc_x, pc_x_val, -model, -scenario)  %>%
    left_join(coordinates %>% 
                  gather(pc_y, pc_y_val, -model, -scenario),
               by = c('model', 'scenario'))  %>%
    mutate(x_id = as.integer(substr(pc_x, nchar(pc_x), nchar(pc_x))),
           y_id = as.integer(substr(pc_y, nchar(pc_y), nchar(pc_y)))) %>%
    filter(y_id > x_id) %>%
    # add frac variance explained by each PC to labels:
    left_join(var_explained_df %>%
                  mutate(PC = paste0('PC', PC), 
                         var_explained = round(100*var_explained,
                                               digits = 1)),
              by = c('pc_x' = 'PC')) %>%
    mutate(pc_x = paste0(pc_x, ' (',var_explained, '%)'))   %>%
    select(-var_explained) %>%
    left_join(var_explained_df %>%
                  mutate(PC = paste0('PC', PC), 
                         var_explained = round(100*var_explained,
                                               digits = 1)),
              by = c('pc_y' = 'PC')) %>%
    mutate(pc_y = paste0(pc_y, ' (',var_explained, '%)'))   %>%
    select(-var_explained) ->
    grid_plot
## Warning in left_join(., coordinates %>% gather(pc_y, pc_y_val, -model, -scenario), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 2 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
p_coefs2 <- ggplot(data = grid_plot) +
    geom_point(mapping = aes(x = pc_x_val, y = pc_y_val,
                             color = model, shape = scenario)) +
    #  geom_point(data = grid_plot %>% filter(model %in% coefficient_min$subset),
    #            mapping = aes(x = pc_x_val, y = pc_y_val, color = model, shape = scenario),
    #            size = 2) +
     geom_point(data = grid_plot %>% filter(model %in% coefficient_min$subset),
               mapping = aes(x = pc_x_val, y = pc_y_val),
               color = 'black', shape = 0, size = 2) +
    ggtitle(paste('Projection of ESM data into the first five eigenvectors, cut 6 model\n',
                  fig_dir)) +
    facet_grid(pc_y ~ pc_x, switch = 'both') +
    theme(strip.text = element_text(size = axis_size),
          plot.title = element_text(size=title_size),
          legend.text = element_text(size = axis_size-4),
          legend.key.width = unit(0.3, 'cm')) +
    xlab('') + ylab('')

ggsave(paste0(fig_dir, 'full_data_coeff_grid_plot_cut6_selected.png'), width = 8, height =6, units = 'in')
ggsave(paste0(fig_dir, 'full_data_coeff_grid_plot_cut6_selected.jpg'), width = 8, height =6, units = 'in')

p_coefs2